####################################################################################

#What You See and What You Get
#This file conducts the analysis for W2 survey
#Creates the analyses used for Table and Figures in the main paper and appendix
#Conducts the analyses for data cited in the paper/appendix

####################################################################################

rm(list=ls())
library(AER)
library(tidyverse)
library(readxl)
library(lubridate)
library(sandwich)
library(xtable)
library(corrgram)
library(forcats)
library(estimatr)
library(fastDummies)
library(randomizr)
library(ANOVAreplication)
library(miceadds)
library(clubSandwich)
library(texreg)
options(scipen=999)

source("Code/functions.R")

####################################################################################
# Read W2 dataset (and perform changes)                                       ######
####################################################################################

load("Data/surveyW2-WhatYouSee.Rda")
print(comment(survey))
survey <- as.data.frame(survey)

# Create index of incumbent indices (unplanned)
# Following the same structure for the individual indices
set.seed(1977)
#PT
pt_incumbent_index_vars <- imp.mice(survey %>% dplyr::select(lula_eval, dilma_eval, party_pt_r, 
                                                      votept_2018_d)) 
survey <- survey %>% bind_cols(
  pt_incumbent_index = calculate_mean_effects_index(Z = survey$Z
  , outcome_mat = pt_incumbent_index_vars, greedy = TRUE ))

#Treat
incumbent_treat_index_vars <- imp.mice(survey %>% dplyr::select(dilma_eval, paes_eval))
survey <- survey %>% bind_cols(
  treat_incumbent_index = calculate_mean_effects_index(Z = survey$Z
  , outcome_mat = incumbent_treat_index_vars , greedy = TRUE ))

#Current
incumbent_current_index_vars <- imp.mice(survey %>% dplyr::select(bolso_eval, crivella_eval, witzel_eval))
survey <- survey %>% bind_cols(
  current_incumbent_index = calculate_mean_effects_index(Z = survey$Z
  , outcome_mat = incumbent_current_index_vars , greedy = TRUE ))

# Create index of mobilization (unplanned)
mobilization_index_vars <- imp.mice(survey %>% dplyr::select(talk_cand,
                                                      client_support,
                                                      turnout_2016,
                                                      turnout_2018,
                                                      turnout_2020))
survey <- survey %>% bind_cols(
  mobilization_index = calculate_mean_effects_index(Z = survey$Z, 
                       outcome_mat = mobilization_index_vars, greedy = TRUE ))

surveyw2 <- survey
rm(survey)


####################################################################################
# Declare variables for W2 (controls and outcomes)                            ######
####################################################################################

# Declare labels for control and balance variables
controls <- pre.treat <-  c("idade", 
                            "sexor", 
                            "race_bin", 
                            "cadunico",
                            "formal_pre",
                            "av_prel")

bal.labels <- c("Age",
                "Sex (male)",
                "Race (white)", 
                "Cadunico",
                "Years in Formal Employment", 
                "Avg. Formal Wages")

####################################################################################
# Declare outcomes for analysis                                               ######
####################################################################################
outcomes1 <- c("pt_incumbent_index","treat_incumbent_index", 
               "current_incumbent_index", "mobilization_index")
outcomes2 <- c("resp_lula_dilma_pt", "resp_paes_pezao_temer_mdb") 
outcomes3 <- c("lula_eval", "dilma_eval", "party_pt_r", "votept_2018_d",
               "paes_eval", 
               "bolso_eval", "crivella_eval", "witzel_eval",
               "talk_cand",
               "client_support",
               "turnout_2016",
               "turnout_2018",
               "turnout_2020") 
outcomes4 <- c("voteinc_2016_d", "votepaes_2020_d", "pezao_eval")

#These definitions are needed below, in different parts of the code
the.fam <- paste("outcomes",1:4,sep="")
outcomes <- do.call("c",sapply(paste("outcomes",1:4,sep=""),"get"))
out.class <- paste("H",1:4,sep="")
names(outcomes)<-NULL
out.selected <- c(1,2) #select the main outcomes  
the.labels <-  c("Workers' Party Eval. Index", "Incumbent at Treatment Evaluation. Index",
                 "Current Inc. Eval Index", 
                 "Mobilization Index",
                 "Attribution to Lula/Dilma/PT", 
                 "Attribution to Paes/Pezao/Temer/MDB",
                 "Lula Evaluation", "Dilma Evaluation",
                 "Workers' Party ID","Vote Haddad 2018",
                 "Paes Evaluation", 
                 "Bolsonaro Evaluation", "Crivela Evaluation", "Witzel Evaluation",
                 "Talk to Candidate","Support Candidate",
                 "Turnout 2016","Turnout 2018","Expected Turnout 2020", 
                 "Vote Pedro Paulo 16", "Vote Paes 2020", "Pezão Evaluation")

# Check whether labels are correct
main <- rep(0,length(outcomes))
main[out.selected] <- 1
var.labels<- data.frame(outcomes,the.labels,main)
print(var.labels)
save(var.labels,file="Routputs/out-W2-variablelabels.RData")

# Print summary statistics 
sumtab <- summary_stats(data=subset(surveyw2,select=c("Z","D",pre.treat,outcomes)))
print(round(sumtab,2))
xsumtab <- xtable(sumtab,digits=c(0,0,2,2,2,2,2,0))
label(xsumtab) <- paste("tab:summary",sep="")
caption(xsumtab) <- paste("Descriptive Statistics",sep="")
print(xsumtab,
      type="latex",
      hline.after=1,
      caption.placement="top",
      file="Tables/tab-W2-descriptives-raw.tex")

########## Descriptive prior to re-scaling

#Data cited in paper
control <- surveyw2 %>% dplyr::filter(Z == 0)
treat <- surveyw2 %>% dplyr::filter(Z == 1)

control <- surveyw2 %>% dplyr::filter(Z == 0)
treat <- surveyw2 %>% dplyr::filter(Z == 1)
control_pure <- control %>% dplyr::filter(!fieldID %in% treat$fieldID)

table(control_pure$improved_lives)
table(control_pure$dream)

prop.table(table(treat$improved_lives))
prop.table(table(control_pure$improved_lives))*100
31.611570 + 51.446281

prop.table(table(control_pure$dream))*100

hist(surveyw2$improved_lives)
ks.test(surveyw2$improved_lives, control$dilma_eval)
ks.test(control$paes_eval, control$lula_eval)
ks.test(control$paes_eval, control$pezao_eval)

##################################

#Re-scaling outcomes
surveyw2.unscaled <- surveyw2
surveyw2[outcomes] <- scalev(outcomes=outcomes,the.data=surveyw2)

#Standardized Descriptive Table
sumtab <- summary_stats(data=subset(surveyw2,select=c("Z","D",pre.treat,outcomes)))
print(round(sumtab,2))
xsumtab <- xtable(sumtab, digits=c(0,0,2,2,2,2,2,0))
label(xsumtab) <- paste("tab:summary",sep="")
caption(xsumtab) <- paste("Descriptive Statistics (Standardized)",sep="")
print(xsumtab,
      type="latex",
      hline.after=1,
      caption.placement="top",
      file="Tables/tab-W2-descriptives-std.tex")

####################################################################################
# Balance by Lottery                                                          ######
####################################################################################
the.lotteries <- unique(surveyw2$id_edital)
# Loop over lotteries  #
bal.stats <- list() #object to store balance results
for(ii in 1:length(the.lotteries)){ 
  print(the.lotteries[ii])
  lot <- subset(surveyw2,id_edital==the.lotteries[ii])
  #Implements an F-test
  the.formula <- formula(paste("Z~",paste(pre.treat,collapse="+")))
  m1 <- lm_robust(the.formula, 
                  data = lot, 
                  se_type = "stata")
  #Standard F-test 
  the.wald <- lmtest::waldtest(m1,  test = c("F"))
  print(the.wald)
  # Compute the p-value of the F-tests based on the permutation distribution of W
  W_obs <- lmtest::waldtest(m1,  test = c("F"))[2,3] #store f-test for boostrapped p-values 
  N <- nrow(lot)
  sims <- 1000
  n_treat <- as.numeric(table(lot$Z)[2])
  W_sims <- numeric(sims)
  for(i in 1:sims){
    lot$Z_sim <- complete_ra(N, n_treat)
    fit_sim1 <- lm_robust(formula(paste("Z_sim~",paste(pre.treat,collapse="+"))), 
                          data = lot, 
                          se_type = "stata")
    W_sims[i] <- lmtest::waldtest(fit_sim1,  test = c("F"))[2,3]
  }
  p.permutation <- mean(W_sims >= W_obs)
  # Bootrsapped p-value o F-test
  cat("Simulated p-value:",p.permutation,"\n")
  # Compute balance variable by variable lmr
  ballmr <- bind_rows(lapply(1:length(pre.treat), outcomes=pre.treat, FUN = est_lm,
                             ctrls = NULL, the.data=lot)) 
  rownames(ballmr) <- pre.treat
  print(ballmr)
  # Show balance plot for lottery lmr
  #assemble objects with balance stats
  bal.stats[[the.lotteries[ii]]] <- list(N=table(lot$Z),wald=the.wald,p.permutation=p.permutation,
                                         balr = ballmr)
} #end loop for lottery
save(bal.stats,file="Routputs/out-W2-balancebylottery.RData")


####################################################################################
# Balance Pooled                                                         ###########
####################################################################################
# Loop over lotteries  #
pol.bal.stats <- list() #object to store balance results
#Implments an F-test
the.formula <- formula(paste("Z~",paste(pre.treat,collapse="+")))
m1 <- lm_robust(the.formula, 
                data = surveyw2, clusters = fieldID,
                fixed_effects =~id_edital, 
                se_type = "stata")
#Standard F-test 
ftest <- m1$proj_fstatistic
p.value <- 0.8166

# Compute the p-value of the F-tests based on the permutation distribution of W
W_obs <- as.numeric(summary(m1)$proj_fstatistic[1]) #store f-test for boostrapped p-values 
N <- nrow(surveyw2)
sims <- 1000
n_treat <- as.numeric(table(surveyw2$Z)[2])
W_sims <- numeric(sims)
for(i in 1:sims){
  surveyw2$Z_sim <- complete_ra(N, n_treat)
  fit_sim1 <- lm_robust(formula(paste("Z_sim~",paste(pre.treat,collapse="+"))), 
                        data = surveyw2,
                        cluster = fieldID, 
                        fixed_effects =~id_edital,
                        se_type = "stata")
  W_sims[i] <- as.numeric(summary(fit_sim1)$proj_fstatistic[1])
}
p.permutation <- mean(W_sims >= W_obs)
# Bootrsapped p-value o F-test
cat("Simulated p-value:",p.permutation,"\n")

# Compute balance variable by variable lm_robust
ballmr <- bind_rows(lapply(1:length(pre.treat), outcomes=pre.treat, FUN = est_lm,
                           ctrls= NULL,
                           fe = "id_edital", clusters = "fieldID",
                           the.data=surveyw2))
rownames(ballmr) <- pre.treat
print(ballmr)

# Compute balance variable by variable Lin
ballm <- bind_rows(lapply(1:length(pre.treat), outcomes=pre.treat, FUN = est_lin,
                          ctrls = "id_edital", clusters = "fieldID",
                          the.data=surveyw2))
rownames(ballm) <- pre.treat
print(ballm)

#assemble objects with balance stats

pol.bal.stats <- list(N=table(surveyw2$Z),wald=ftest,pvalue = p.value,
                      p.permutation=p.permutation,bal=ballm, balr=ballmr)
save(pol.bal.stats,file="Routputs/out-W2-balancepooled.RData")


####################################################################################
# Attrition Pooled                                                      ############
####################################################################################

#Data for attrition analysis
load("Data/W2_attrition_public.Rda")
print(comment(W2_attrition))
load("Data/W2_attrition_overall_public.Rda")
print(comment(W2_attrition_overall))

### Answered by treatment and control (overall contacted -- may or may not have answered)

attrition_overall <- lm_lin(interviewed ~ treat_ind, cluster = fieldID,
                            se_type = "stata", covariates = ~ id_edital,
                            data = W2_attrition_overall)

attrition_overallr <- lm_robust(interviewed ~ treat_ind, cluster = fieldID,
                            se_type = "stata", fixed_effects = ~ id_edital,
                            data = W2_attrition_overall)

### Answered by treatment and control (among those who answered)

attrition_answered <- lm_lin(interviewed ~ treat_ind, cluster = fieldID,
                             se_type = "stata", covariates = ~ id_edital,
                             data = W2_attrition)

attrition_answeredr <- lm_robust(interviewed ~ treat_ind, cluster = fieldID,
                             se_type = "stata", fixed_effects = ~ id_edital,
                             data = W2_attrition)

#assemble objects with attrition stats
pol.attrition.stats <- list(attrition_overall, 
                            attrition_overallr, 
                            attrition_answered, 
                            attrition_answeredr)

save(pol.attrition.stats,file="Routputs/out-W2-attrition.RData")

##### Remove missing for Wald Test
W2_attrition_overall2 <- W2_attrition_overall %>% filter(!is.na(sexor))
W2_attrition2 <- W2_attrition %>% filter(!is.na(sexor))

#Including interactions of covariates
attrition_overall_covi <- lm_robust(interviewed ~ treat_ind + formal_pre_prop + sexor + av_prel + 
                                      treat_ind*formal_pre_prop + treat_ind*sexor + treat_ind*av_prel, 
                                    se_type = "stata", fixed_effects = ~ id_edital, clusters = fieldID,
                                    data = W2_attrition_overall2)
attrition_answered_covi <- lm_robust(interviewed ~ treat_ind + formal_pre_prop + sexor + av_prel + 
                                       treat_ind*formal_pre_prop + treat_ind*sexor + treat_ind*av_prel, 
                                     se_type = "stata", fixed_effects = ~ id_edital, clusters = fieldID,
                                     data = W2_attrition2)
### No interactions

attrition_overall_cov <- lm_robust(interviewed ~ treat_ind + formal_pre_prop + sexor + av_prel,
                                   se_type = "stata", fixed_effects = ~ id_edital, clusters = fieldID,
                                   data = W2_attrition_overall2)
attrition_answered_cov <- lm_robust(interviewed ~ treat_ind + formal_pre_prop + sexor + av_prel,
                                    se_type = "stata", fixed_effects = ~ id_edital, clusters = fieldID,
                                    data = W2_attrition2)

pol.attrition.joint.stats <- list(attrition_overall_covi, 
                                  attrition_answered_covi, 
                                  attrition_overall_cov, 
                                  attrition_answered_cov)

save(pol.attrition.joint.stats,file="Routputs/out-W2-attrition-joint.RData")


####################################################################################
# Analysis of Pooled Effects                                                  ######
# Estimation (3 estimators ITT-lin, ITT-lm, CACE-lm; 2 variants in each)      ######
# Summary tables (combining selected ITT and CACE)                            ######
####################################################################################

####  ITT-Lin: two variants  ####
#out.lin       & (sem idade e sem controles, com edital como covariate) 
#out.linc      & (com idade e com controles, com edital como covariate)
tmp <- bind_rows(lapply(1:length(outcomes)
                        ,outcomes= outcomes
                        ,ctrls= c("id_edital")
                        ,clusters="fieldID"
                        ,FUN = est_lin, the.data=surveyw2))
rownames(tmp) <- outcomes
out.lin <- as.data.frame(tmp)[,c("Estimate","Std. Error","Pr(>|t|)","N","Std. Estimate")]                
#Compute HB corrected p-values by hypothesis family
out.lin$HBp <- NA 
for(ii in 1:length(the.fam)){
  out.lin[get(the.fam[ii]),"HBp"] <- p.adjust(out.lin[get(the.fam[ii]),"Pr(>|t|)"],"BH")
}
tmp <- bind_rows(lapply(1:length(outcomes)
                        ,outcomes= outcomes
                        ,ctrls= c("id_edital","idade",pre.treat)
                        ,clusters="fieldID"
                        ,FUN = est_lin, the.data=surveyw2))
rownames(tmp) <- outcomes
out.linc <- as.data.frame(tmp)[,c("Estimate","Std. Error","Pr(>|t|)","N","Std. Estimate")]                
#Compute HB corrected p-values by hypothesis family
out.linc$HBp <- NA 
for(ii in 1:length(the.fam)){
  out.linc[get(the.fam[ii]),"HBp"] <- p.adjust(out.linc[get(the.fam[ii]),"Pr(>|t|)"],"BH")
}

out.itt.lin <- list(out.lin=out.lin,out.linc=out.linc)  

####  ITT-Lm: two variants  #### 
#out.lm        & (sem controles, com edital como fixed effect) 
#out.lmc       & (com controles, com edital como fixed effect)(*) 
tmp <- bind_rows(lapply(1:length(outcomes)
                        ,outcomes= outcomes
                        ,ctrls= NULL
                        ,fe = "id_edital"
                        ,clusters="fieldID"
                        ,FUN = est_lm, the.data=surveyw2))
rownames(tmp) <- outcomes
out.lm <- as.data.frame(tmp)[,c("Estimate","Std. Error","Pr(>|t|)","N","Std. Estimate")]                
#Compute HB corrected p-values by hypothesis family
out.lm$HBp <- NA 
for(ii in 1:length(the.fam)){
  out.lm[get(the.fam[ii]),"HBp"] <- p.adjust(out.lm[get(the.fam[ii]),"Pr(>|t|)"],"BH")
}

tmp <- bind_rows(lapply(1:length(outcomes)
                        ,outcomes= outcomes
                        ,ctrls= pre.treat
                        ,fe = "id_edital"
                        ,clusters="fieldID"
                        ,FUN = est_lm, the.data=surveyw2))
rownames(tmp) <- outcomes
out.lmc <- as.data.frame(tmp)[,c("Estimate","Std. Error","Pr(>|t|)","N","Std. Estimate")]                
#Compute HB corrected p-values by hypothesis family
out.lmc$HBp <- NA 
for(ii in 1:length(the.fam)){
  out.lmc[get(the.fam[ii]),"HBp"] <- p.adjust(out.lmc[get(the.fam[ii]),"Pr(>|t|)"],"BH")
}
out.itt.lm <- list(out.lm=out.lm,out.lmc=out.lmc)  

####  CACE-Lm: two variants  #### 
#out.lmcace        & (sem controles, com edital como fixed effect) 
#out.lmcacec       & (com controles, com edital como fixed effect)(*)   

tmp <- bind_rows(lapply(1:length(outcomes),  outcomes=outcomes, treat="Z"
                        ,compliance="D" #remember to check with "D_survey" too!
                        ,the.data=surveyw2
                        ,fe="id_edital"
                        ,clusters="fieldID"
                        ,FUN = est_cace)) 
rownames(tmp) <- outcomes
out.lmcace <- as.data.frame(tmp)[,c("Estimate","Std. Error","Pr(>|t|)","N","Std. Estimate")]                
#Compute HB corrected p-values by hypothesis family
out.lmcace$HBp <- NA 
for(ii in 1:length(the.fam)){
  out.lmcace[get(the.fam[ii]),"HBp"] <- p.adjust(out.lmcace[get(the.fam[ii]),"Pr(>|t|)"],"BH")
}

tmp <- bind_rows(lapply(1:length(outcomes),  outcomes=outcomes, treat="Z"
                        ,compliance="D" #remember to check with "D_survey" too!
                        ,the.data=surveyw2
                        ,ctrls=pre.treat
                        ,fe="id_edital"
                        ,clusters="fieldID"
                        ,FUN = est_cace)) 
rownames(tmp) <- outcomes
out.lmcacec <- as.data.frame(tmp)[,c("Estimate","Std. Error","Pr(>|t|)","N","Std. Estimate")]                
#Compute HB corrected p-values by hypothesis family
out.lmcacec$HBp <- NA 
for(ii in 1:length(the.fam)){
  out.lmcacec[get(the.fam[ii]),"HBp"] <- p.adjust(out.lmcacec[get(the.fam[ii]),"Pr(>|t|)"],"BH")
}    

out.cace.lm <- list(out.lmcace=out.lmcace,out.lmcacec=out.lmcacec)  

### Collect all results (list of lists)
out.all <- list(out.itt.lm=out.itt.lm,out.itt.lin=out.itt.lin,out.cace.lm=out.cace.lm)
comment(out.all) <- paste("Estimated in ",Sys.Date(),". Three estimators, two variants in each.",sep="")
save(out.all,file="Routputs/out-W2-estimates-pooled.RData")


############ Attrition 


load("Data/W2_attrition_admin_overall_public.Rda")

admin_av <-  lm_robust(av_postl ~ treat_indf, cluster = fieldID,
          se_type = "stata", fixed_effects =~id_edital,
          data = data_admin)

admin_formal <- lm_robust(formal_post ~ treat_indf, cluster = fieldID,
          se_type = "stata", fixed_effects =~id_edital,
          data = data_admin)

### Same analysis with 

survey_av <- lm_robust(av_postl ~ Z, cluster = fieldID,
                    se_type = "stata", fixed_effects =~id_edital,
                    data = surveyw2)

survey_formal <- lm_robust(formal_post ~ Z, cluster = fieldID,
                       se_type = "stata", fixed_effects =~id_edital,
                       data = surveyw2)

#Comparing estimates
#Data cited
difftest(as.numeric(survey_av$coefficients), as.numeric(admin_av$coefficients), 
         as.numeric(survey_av$std.error), as.numeric(admin_av$std.error))

difftest(as.numeric(survey_formal$coefficients), as.numeric(admin_formal$coefficients), 
         as.numeric(survey_formal$std.error), as.numeric(admin_formal$std.error))

benchmarkW2 <- list(admin_av, admin_formal, survey_av, survey_formal)

save(benchmarkW2, file="Routputs/out-W2-benchmark.RData")

####################################################################################
# Estimates using IPW approach
####################################################################################

#Outcomes with controls
tmp_ipw <- bind_rows(lapply(1:length(outcomes)
                            ,outcomes= outcomes
                            ,ctrls= pre.treat
                            ,clusters="fieldID"
                            , w = surveyw2$ipw
                            ,FUN = est_ipw, the.data=surveyw2))
rownames(tmp_ipw) <- outcomes
out.ipwc <- as.data.frame(tmp_ipw)[,c("Estimate","Std. Error","Pr(>|t|)","N","Std. Estimate")]                

#Outcomes without controls 
tmp_ipw <- bind_rows(lapply(1:length(outcomes)
                            ,outcomes= outcomes
                            ,ctrls= NULL
                            ,clusters="fieldID"
                            , w = surveyw2$ipw
                            ,FUN = est_ipw, the.data=surveyw2))
rownames(tmp_ipw) <- outcomes
out.ipw <- as.data.frame(tmp_ipw)[,c("Estimate","Std. Error","Pr(>|t|)","N","Std. Estimate")]                

out.all <- list(out.itt.ipwc=out.ipwc,
                out.itt.ipw=out.ipw)
comment(out.all) <- paste("Estimated in ",Sys.Date(),"IPW estimates with and without controls.",sep="")
save(out.all,file="Routputs/out-W2-estimates-ipw.RData")

###################### Data cited in paper

#number of respondents
length(unique(surveyw2$fieldID))

#Attribution MCMV
prop.table(table(surveyw2.unscaled$resp_lula_dilma_pt, surveyw2.unscaled$Z), 2)*100

#Support Lula
prop.table(table(surveyw2.unscaled$lula_eval, surveyw2.unscaled$Z), 2)*100
mean(surveyw2.unscaled$lula_eval, na.rm = T)

#homeowners
difference_in_means(homeowner ~ Z, data = surveyw2)
prop.table(table(surveyw2$homeowner, surveyw2$D), 2)*100

#identification with PT
prop.table(table(surveyw2.unscaled$party_pt_r, surveyw2.unscaled$Z), 2)*100

#support MCMVV
prop.table(table(surveyw2$improved_lives, surveyw2$D), 2)*100
